https://github.com/GreatLearningAIML1/gl-pgp-aiml-uta-intl-may20-kv2001
Problem Statement: McCurr Consultancy – Attrition Analysis
import pandas as pd # data processing
import numpy as np # linear algebra
import matplotlib.pyplot as plt # for plotting graphs
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the HR attrition dataset and run basic structural checks.
data1 = pd.read_excel('HR_Employee_Attrition Dataset.xlsx')
data1.dtypes
data1.shape
data1.describe()  # central values (mean and median), standard deviation and quartiles, etc
data1.info()
data1.tail()
data1.isnull().any()  # to identify any null values

# Number of distinct values per column. nunique() is the idiomatic form of
# len(unique()); the loop body indentation (lost in notebook flattening) is
# restored here.
for col in data1.columns:
    print(col, data1[col].nunique())
Check for missing values (as a percentage of total rows)
# Share of missing values per column, expressed as a percentage of all rows.
data1.isna().sum() / len(data1) * 100
# Columns that contain at least one null value (expected to be empty here).
data1.loc[:, data1.isnull().any()].columns
# Skewness of the numeric columns. numeric_only=True is required because
# data1 still holds object (string) columns at this point; pandas >= 2.0
# raises a TypeError from DataFrame.skew() without it.
data1.skew(numeric_only=True)
Univariate Plots:
# Univariate distributions of the numeric features.
# seaborn.distplot was deprecated in 0.11 and removed in 0.14;
# histplot(..., kde=True) is the documented replacement. A loop replaces
# the eleven copy-pasted calls; plt.figure()/plt.show() keep one figure
# per feature, as the separate notebook cells originally did.
numeric_features = [
    'Age', 'DistanceFromHome', 'MonthlyIncome', 'NumCompaniesWorked',
    'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
    'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
for feature in numeric_features:
    plt.figure()
    sns.histplot(data1[feature], kde=True)
    plt.show()
# Reassign target: encode Attrition as 1 (Yes) / 0 (No).
# The original used attribute access plus inplace=True
# (data1.Attrition.replace(..., inplace=True)), which mutates a column
# view and is deprecated under pandas Copy-on-Write; an explicit
# assignment back to the column is the supported form.
data1['Attrition'] = data1['Attrition'].replace({'Yes': 1, 'No': 0})
Count of Attrition
# Class balance of the target. seaborn >= 0.12 deprecates passing a Series
# positionally (the first positional arg is `data`), so use x=/data=.
sns.countplot(x='Attrition', data=data1)

# Mean of each numeric feature per Attrition class. A loop replaces the
# eleven copy-pasted figure/barplot pairs; the feature order is unchanged.
features = [
    'Age', 'DistanceFromHome', 'MonthlyIncome', 'NumCompaniesWorked',
    'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
    'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
for feature in features:
    plt.figure(figsize=(8, 4))
    ax = sns.barplot(y=feature, x="Attrition", data=data1)
Degree of relationship between the target variable and the features: the attrition rate is high when YearsWithCurrManager is lower (newer employees), when YearsInCurrentRole is lower, when YearsAtCompany is lower, and when TotalWorkingYears is lower. The attrition rate is also high when NumCompaniesWorked is greater and when DistanceFromHome is higher.
# Pairwise scatter/density plots across all numeric features.
sns.pairplot(data1)

# Per-feature box plots split by the Attrition classes, one figure each.
numeric_cols = [
    'Age', 'DistanceFromHome', 'MonthlyIncome', 'NumCompaniesWorked',
    'PercentSalaryHike', 'TotalWorkingYears', 'TrainingTimesLastYear',
    'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
    'YearsSinceLastPromotion', 'YearsWithCurrManager',
]
for col in numeric_cols:
    sns.boxplot(x='Attrition', y=col, data=data1)
    plt.show()

# Income distribution by gender, split by attrition, on a single axis.
fig, ax = plt.subplots(figsize=(10, 7))
sns.violinplot(x='Gender', y='MonthlyIncome', hue='Attrition', split=True, data=data1)
Gender distribution does not seem to make much difference to attrition.
# Correlation heatmap of the numeric features.
f, ax = plt.subplots(figsize=(20, 20))
# numeric_only=True: data1 still contains object columns here (JobRole,
# Gender, ...), and pandas >= 2.0 raises a TypeError from corr() without it.
sns.heatmap(data1.corr(numeric_only=True), annot=True, linewidths=0.5,
            linecolor="green", fmt=".1f", ax=ax)
plt.show()

# Income spread per job role.
sns.boxplot(x='MonthlyIncome', y='JobRole', data=data1)
data1.columns

# Drop the identifier and constant columns, which carry no predictive signal.
data1.drop(['EmployeeNumber', 'Over18', 'StandardHours'], axis="columns", inplace=True)
data1.shape

# Class balance of the (now numeric) target.
df_left = data1[data1['Attrition'] == 1]
df_stay = data1[data1['Attrition'] == 0]
print(f'Percentage of employees left the Company : {(len(df_left) / len(data1))*100} %')
print(f'Percentage of employees Stayed the Company : {(len(df_stay) / len(data1))*100} %')
# Split the features into categorical and numerical parts.
x_cat = data1.select_dtypes(include='object')
x_cat

from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
# Keep the generated feature names: the original wrapped the encoded array
# in a DataFrame with anonymous integer column labels, which makes the
# downstream feature matrix impossible to interpret.
encoded = onehotencoder.fit_transform(x_cat).toarray()
x_cat = pd.DataFrame(encoded,
                     columns=onehotencoder.get_feature_names_out(),
                     index=x_cat.index)
x_cat

# Numeric features, excluding the target itself.
x_numerical = data1.select_dtypes(exclude='object')
x_numerical.drop('Attrition', axis=1, inplace=True)
x_numerical

# Full feature matrix (both halves share the default RangeIndex).
x_data = pd.concat([x_cat, x_numerical], axis=1)
x_data

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()  # renamed: 'scalar' was a typo for 'scaler'
x = scaler.fit_transform(x_data)
x

y = data1['Attrition']
y
from sklearn.model_selection import train_test_split

# random_state makes the split reproducible between runs; stratify=y keeps
# the class ratio of the imbalanced Attrition target identical in the
# train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42, stratify=y)
print(f'Shape of X train :{X_train.shape}')
print(f'Shape of X test :{X_test.shape}')
print(f'Shape of y train :{y_train.shape}')
print(f'Shape of y test :{y_test.shape}')
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

# Baseline logistic regression on the scaled features. max_iter raised
# from the default 100: lbfgs commonly fails to converge on one-hot
# encoded data and emits a ConvergenceWarning otherwise.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# sklearn metrics expect (y_true, y_pred); the original calls had the
# arguments swapped, which transposes the confusion matrix and exchanges
# precision and recall in the classification report.
print(f'Accuracy : {100 * accuracy_score(y_test, y_pred)}')
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
print('Classification report')
print('=======' * 10)
print(classification_report(y_test, y_pred))
print('=======' * 10)
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline on the same split.
randomforest = RandomForestClassifier()
randomforest.fit(X_train, y_train)
y_pred = randomforest.predict(X_test)

# (y_true, y_pred) order — swapped in the original, which transposes the
# confusion matrix and exchanges precision/recall in the report.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
print('Classification report')
print('=======' * 10)
print(classification_report(y_test, y_pred))
print('=======' * 10)
from xgboost import XGBClassifier

# Gradient-boosted trees baseline.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# (y_true, y_pred) order — swapped in the original, which transposes the
# confusion matrix and exchanges precision/recall in the report.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
print('Classification report')
print('=======' * 10)
print(classification_report(y_test, y_pred))
print('=======' * 10)
!pip3 install -U imbalanced-learn
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=27,sampling_strategy=1.0)
X_train,y_train = sm.fit_sample(X_train,y_train)
smote_logistic = LogisticRegression()
smote_logistic.fit(X_train,y_train)
smote_pred = smote_logistic.predict(X_test)
sns.heatmap(confusion_matrix(y_test,smote_pred),annot=True,fmt='.2f',cmap='YlGnBu')
plt.show()
print('Classification report')
print('======='*10)
print(classification_report(smote_pred,y_test))
print('======='*10)
# Random forest retrained on the SMOTE-balanced training set.
randomforest = RandomForestClassifier()
randomforest.fit(X_train, y_train)
y_pred_smote = randomforest.predict(X_test)
# confusion_matrix expects (y_true, y_pred); the original order was
# swapped, which transposes the matrix.
sns.heatmap(confusion_matrix(y_test, y_pred_smote), annot=True, fmt='d', cmap='YlGnBu')
plt.show()
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

num_folds = 50
seed = 7
# shuffle=True is required when random_state is set: modern sklearn raises
# ValueError for KFold(random_state=...) without shuffling.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression(max_iter=1000)
# NOTE(review): X_train/y_train were SMOTE-oversampled above, so synthetic
# samples leak across CV folds and these scores are optimistically biased.
# Resampling should ideally happen inside each fold (imblearn Pipeline).
results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='roc_auc')
print(results)
# scoring='roc_auc', so label the output as such (it was printed as
# "Accuracy" in the original).
print("ROC AUC: %.3f%% (%.3f%%)" % (results.mean() * 100.0, results.std() * 100.0))
Techniques employed to squeeze that extra performance out of the model without making it over fit or under fit. Use Grid Search or Random Search on any of the two models used above. Make a DataFrame to compare models after hyperparameter tuning and their metrics as above.
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

# Build the classifier to tune; random_state makes each fitted forest
# reproducible between runs.
clf = RandomForestClassifier(n_estimators=50, random_state=7)

# Parameters and distributions to sample candidates from.
param_dist = {
    "max_depth": [3, None],
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
}

# Run the randomized search. random_state fixes the sampled parameter
# combinations; note the default cv in current sklearn is 5 (the original
# comment said 3, which only held for sklearn < 0.22).
samples = 10  # number of random parameter combinations to try
randomCV = RandomizedSearchCV(clf, param_distributions=param_dist,
                              n_iter=samples, random_state=7)
randomCV.fit(X_train, y_train)
print(randomCV.best_params_)